Much of the data exploration below was inspired by Michael Griffiths: https://www.kaggle.com/msjgriffiths/exploratory-analysis/code
# Packages used throughout this script. They are attached explicitly so the
# script does not depend on whatever happens to be loaded in the session.
library(dplyr)       # %>%, filter, select, group_by, summarise, inner_join
library(ggplot2)     # ggplot, geoms, labs, theme
library(plotly)      # ggplotly
library(formattable) # formattable

# Load data
# animes and genres are read with strings kept as character vectors; ratings
# is read with read.csv's default string handling, as before.
animes <- read.csv(
  "../data/clean/animes.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
genres <- read.csv(
  "../data/clean/genres.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
ratings <- read.csv("../data/raw/no_null_ratings.csv", header = TRUE)
# Number of top-rated titles shown in the bar chart
N <- 10

# Rank every anime by rating, highest first (order() places NA ratings last),
# then keep the first N rows. head() is used instead of df[1:N, ] so a table
# with fewer than N rows is not padded with all-NA rows.
df <- data.frame(
  Anime = animes$name,
  Rating = animes$rating,
  stringsAsFactors = FALSE
)
df <- df[order(df$Rating, decreasing = TRUE), ]
df <- head(df, N)

# Fix the factor levels to the ranked order so the bars keep that order in the
# plot. Levels are built after the cut-off, so only the plotted names matter.
df$Anime <- factor(df$Anime, levels = unique(df$Anime))

# Bar chart of the top N ratings. The y axis is zoomed to 9-10 with
# coord_cartesian(), which crops the view without dropping any data.
ggplot(df, aes(x = Anime, y = Rating)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  coord_cartesian(ylim = c(9.0, 10.0)) +
  labs(
    title = "Top Anime Ratings",
    caption = "source: MAL dataset"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
# Overlaid, semi-transparent density curves of `rating`, one per `type`,
# with the x axis limited to the 0-10 range.
ggplot(data = animes, mapping = aes(x = rating, group = type)) +
  geom_density(mapping = aes(fill = type), alpha = 0.4) +
  xlim(0, 10) +
  labs(title = "Average Anime Ratings by Type")
## Warning: Removed 230 rows containing non-finite values (stat_density).
# Per-type summary table: mean and standard deviation of `members` and of
# `rating`. Missing ratings are skipped (na.rm = TRUE); `members` is averaged
# as-is. The result is rendered with formattable().
animes %>%
  group_by(type) %>%
  summarise(
    average.viewers = mean(members),
    sd.viewers = sd(members),
    average.rating = mean(rating, na.rm = TRUE),
    sd.rating = sd(rating, na.rm = TRUE)
  ) %>%
  formattable()
| type | average.viewers | sd.viewers | average.rating | sd.rating |
|---|---|---|---|---|
|  | 6537.400 | 13278.495 | NaN | NA |
| Movie | 10369.094 | 30898.076 | 6.318414 | 1.2119725 |
| Music | 1311.840 | 4548.136 | 5.588996 | 0.9584401 |
| ONA | 4114.030 | 12399.959 | 5.643298 | 1.1270907 |
| OVA | 5986.140 | 15026.128 | 6.375221 | 0.8583584 |
| Special | 7676.061 | 15546.290 | 6.523501 | 0.8877620 |
| TV | 42683.658 | 89121.009 | 6.902299 | 0.8635256 |
# Attach each anime's rating to its genre rows (matched on anime_id), then
# draw one semi-transparent rating density curve per genre.
animes_subset_ratings <- select(animes, "anime_id", "rating")
genres_with_ratings <- genres %>%
  inner_join(animes_subset_ratings, by = "anime_id")
g <- ggplot(
  data = genres_with_ratings,
  mapping = aes(x = rating, group = genre)
) +
  geom_density(mapping = aes(fill = genre), alpha = 0.4)
ggplotly(g)
## Warning: Removed 690 rows containing non-finite values (stat_density).
# Count of rows per distinct rating value; the rating is converted to a
# factor so each value gets its own bar.
g <- ggplot(data = ratings, mapping = aes(x = factor(rating))) +
  labs(title = "Distribution of ratings") +
  geom_bar()
ggplotly(g)
# Density of the per-user mean rating: ratings are averaged within each
# user_id (column `m`) and the distribution of those means is plotted.
g <- ggplot(
  data = ratings %>%
    group_by(user_id) %>%
    summarise(m = mean(rating)),
  mapping = aes(x = m)
) +
  labs(title = "Distribution of average rating over users") +
  geom_density()
ggplotly(g)
# Spread of the ratings given to "School Days".
# The title lookup may return more than one anime_id, so the ratings are
# matched with %in% rather than ==, which would silently recycle the ids.
school_days.id <- filter(animes, name == "School Days")$anime_id
school_days.ratings <- filter(ratings, anime_id %in% school_days.id)
school_days.sd <- sd(school_days.ratings$rating)
print(school_days.sd)
## [1] 2.354353
# Summary statistics of all ratings, for comparison
summary(ratings$rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 7.000 8.000 7.808 9.000 10.000
# Standard deviation over all ratings, for comparison with school_days.sd
ratings.sd <- sd(ratings$rating)
print(ratings.sd)
## [1] 1.572496
#school_days.ratings %>% ggplot(aes(x = bin, y = ..density.., group = source, fill = source)) +
# geom_bar(alpha = 0.5, position = 'identity')
# Base plot of the School Days ratings; it has no layers of its own, each
# view below adds one.
g <- ggplot(school_days.ratings, aes(x = rating))
g + geom_density()
g + geom_bar() # Bar plot
# The interactive version needs a layer as well: `g` alone has nothing to draw.
ggplotly(g + geom_bar())
# Create a fake grouping variable, for a violin plot of 1 dim
g +
  geom_violin(aes(x = factor(0), y = rating), trim = FALSE, adjust = 2) +
  xlab("") +
  scale_x_discrete(breaks = NULL) # hide the tick for the dummy group
# Yeah, looks like the School Days ratings vary more than ratings overall (sd 2.35 vs. 1.57).
# Let’s compute class rankings to find the true Weebs
# Weeb score
# Score bounds: a student is assumed to give a favorite anime the top score
# and a least-favorite anime the bottom score.
MAX_SCORE <- 10
MIN_SCORE <- 1

# In-class survey, stored as two parallel vectors: fav_animes[i] is the
# favorite of students[i]. An empty string means no favorite was given.
students <- c(
  "Adriana",
  "Beau",
  "David",
  "Fanny",
  "Joe",
  "Kevin",
  "Lilly (Ralf)",
  "Lydia",
  "Mac",
  "Michael",
  "Noah",
  "Richard",
  "Roger",
  "Saad",
  "Shane",
  "Stephanie",
  "Ty",
  "Xiaotai"
)
fav_animes <- c(
  "Psycho-Pass",
  "One Punch Man",
  "Cowboy Bebop",
  "",
  "FLCL",
  "Death Note",
  "Last Exile",
  "JoJo no Kimyou na Bouken (TV)",
  "Pokemon",
  "Tonari no Totoro",
  "Ginga Eiyuu Densetsu",
  "Afro Samurai",
  "Yuri!!! on Ice",
  "Dragon Ball Z",
  "JoJo no Kimyou na Bouken: Diamond wa Kudakenai",
  "Ouran Koukou Host Club",
  "Mushishi",
  "Doraemon (1979)"
)
# Culture score for one favorite anime: the squared distance between the
# anime's rating and the maximum score, so a lower score means a higher-rated
# favorite.
#
# Arguments:
#   fav_anime   a single title, matched exactly against the `name` column
#   anime_table data frame with `name` and `rating` columns (default: the
#               global `animes` table)
#   max_score   the score the rating is compared against (default: MAX_SCORE)
#   min_score   rating assumed when the title is not found or has no rating
#               (default: MIN_SCORE)
#
# Returns a single number, (max_score - rating)^2. If the title matches
# several rows, the first row with a non-missing rating is used, so the result
# always has length one.
calculate_culture_score <- function(fav_anime,
                                    anime_table = animes,
                                    max_score = MAX_SCORE,
                                    min_score = MIN_SCORE) {
  # which() ignores NA comparisons, so rows with a missing name never match
  matched_rows <- which(anime_table$name == fav_anime)
  matched_ratings <- anime_table$rating[matched_rows]
  matched_ratings <- matched_ratings[!is.na(matched_ratings)]

  if (length(matched_ratings) == 0L) { # no result, or no usable rating
    rating <- min_score
  } else {
    rating <- matched_ratings[[1L]]
  }

  culture.score <- (max_score - rating)^2
  #if (culture.score < 1) {
  #print("Ah, I see you're a man of culture as well.")
  #}
  culture.score
}
# Score every student's favorite anime. vapply() requires exactly one numeric
# score per title and fails loudly otherwise; the result is named by title.
weeb.scores <- vapply(fav_animes, calculate_culture_score, numeric(1))

# One row per student. The favorite title is kept as a real column so the plot
# tooltips do not depend on row names or on the global `df` at render time.
df <- data.frame(
  Student = students,
  Anime = fav_animes,
  Score = weeb.scores,
  stringsAsFactors = FALSE
)
df <- df[order(df$Score), ] # sort ascending by score
df$Student <- factor(df$Student, levels = df$Student) # to retain the order in plot

# Bar chart of culture score per student. The `text` aesthetic is not drawn by
# ggplot itself; it is picked up by ggplotly() for the hover tooltip.
plot_class_rankings <- function(rankings) {
  ggplot(
    rankings,
    aes(x = Student, y = Score, text = paste("Anime: ", Anime))
  ) +
    geom_bar(stat = "identity", width = .5, fill = "tomato3") +
    labs(
      title = "Class Rankings",
      subtitle = "Culture Score",
      caption = "source: In-Class Survey"
    ) +
    theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
}

g <- plot_class_rankings(df)
ggplotly(g, tooltip = c("text", "x"))

# Same chart without the last row, which after the ascending sort is the
# highest score. head(df, -1) replaces df[1:nrow(df)-1, ], which only worked
# because `1:n - 1` evaluates to 0:(n - 1) and index 0 is ignored.
df <- head(df, -1)
g <- plot_class_rankings(df)
ggplotly(g, tooltip = c("text", "x"))